# Read the Stanford MSA database CSV from the working directory and preview it.
msaData <- read_csv("Stanford_MSA_Database.csv")
head(msaData)

# Build msaDataTrunc by dropping Description, Notes, the listed detail columns,
# and every column whose name starts with "Data ".
msaDataTrunc <- msaData %>%
  dplyr::select(
    -Description,
    -"Day of Week",
    -"Shooter Name",
    -"Type of Gun - Detailed",
    -"Targeted Victim/s - Detailed",
    -"Possible Motive - Detailed",
    -"History of Mental Illness - Detailed",
    -starts_with("Data "),
    -Notes
  )
head(msaDataTrunc)
library(rworldmap)
library(ggmap)
# State outline data used as the base of the choropleth map.
states <- map_data(map = "state")

# Note: the mapping functions are very sensitive to variable names.
# In our dataset the state column is called "State", BUT it needs to be called
# "region" to line up with the map data, so rename it.
msaDataTrunc <- msaDataTrunc %>% rename(region = State)

library(openintro) ## students will need to install this library

# Lowercase the state names so they can match the map data's region values.
msaDataTrunc$region <- tolower(msaDataTrunc$region)

# Join the per-state shooting counts (column `n`, produced by count()) onto the
# map data. The key is spelled out so the join does not depend on whichever
# column names the two tables happen to share.
states <- left_join(states, count(msaDataTrunc, region), by = "region")
head(states)
# Note: we need to determine the color on the map. More shootings means a
# brighter color! The fill is driven by `n`, the per-state count joined onto
# `states`.
ggplot() +
  geom_map(
    data = states,
    map = states,
    aes(map_id = region, x = long, y = lat, fill = n)
  ) +
  scale_fill_gradient(low = "purple", high = "orange", guide = "colourbar") +
  labs(
    x = "Longitude",
    y = "Latitude",
    title = "Choropleth Map of Mass Shootings"
  )
# Bounding box (longitude/latitude) for the base map tiles.
us <- c(left = -126, bottom = 24.5, right = -66, top = 50)
# NOTE(review): newer ggmap releases replace get_stamenmap() with
# get_stadiamap() -- confirm against the installed ggmap version.
myMap <- get_stamenmap(us, zoom = 5, maptype = "toner-lite")

# Base map with 2D density contour lines and one point per incident.
ggmap(myMap) +
  geom_density2d(
    data = msaDataTrunc,
    aes(x = Longitude, y = Latitude),
    size = 0.3
  ) +
  # No layer in this plot maps fill or alpha, so these two scales have nothing
  # to act on here; they matter once a filled density layer is added.
  scale_fill_gradient(low = "blue", high = "orange") +
  # guide = "none" replaces the deprecated guide = FALSE.
  scale_alpha(range = c(0, 0.3), guide = "none") +
  labs(
    x = "Longitude",
    y = "Latitude",
    title = "Heat Map of US Mass Shooting Fatalities"
  ) +
  geom_point(
    data = msaDataTrunc,
    aes(x = Longitude, y = Latitude),
    alpha = .5,
    size = 1
  )
# Much smaller bounding box than before (a zoomed-in view), fetched at a
# higher zoom level with the "terrain" tiles.
us <- c(left = -119.5, bottom = 33, right = -116, top = 34.5)
# NOTE(review): newer ggmap releases replace get_stamenmap() with
# get_stadiamap() -- confirm against the installed ggmap version.
myMap <- get_stamenmap(us, zoom = 10, maptype = "terrain")
help("get_stamenmap")

# Contour lines plus filled density polygons whose fill colour and
# transparency both follow the density level, with the raw points on top.
ggmap(myMap) +
  geom_density2d(
    data = msaDataTrunc,
    aes(x = Longitude, y = Latitude),
    size = 0.3
  ) +
  stat_density2d(
    data = msaDataTrunc,
    aes(x = Longitude, y = Latitude, fill = ..level.., alpha = ..level..),
    size = 0.01,
    bins = 20,
    geom = "polygon"
  ) +
  scale_fill_gradient(low = "blue", high = "orange") +
  # guide = "none" hides the alpha legend; it replaces the deprecated
  # guide = FALSE.
  scale_alpha(range = c(0, 0.3), guide = "none") +
  labs(
    x = "Longitude",
    y = "Latitude",
    title = "Heat Map of US Mass Shooting Fatalities"
  ) +
  geom_point(
    data = msaDataTrunc,
    aes(x = Longitude, y = Latitude),
    alpha = .5,
    size = 1
  )
# garlicData is read here but not used by the plot below.
garlicData <- read_csv("GarlicMustardData.csv")

# Wide bounding box again, this time with the "terrain" tiles.
us <- c(left = -126, bottom = 24.5, right = -66, top = 50)
# NOTE(review): newer ggmap releases replace get_stamenmap() with
# get_stadiamap() -- confirm against the installed ggmap version.
myMap <- get_stamenmap(us, zoom = 5, maptype = "terrain")

# Density map of the shootings data with example annotations layered on top:
# a rectangle, a text label, and two line segments.
ggmap(myMap) +
  geom_density2d(
    data = msaDataTrunc,
    aes(x = Longitude, y = Latitude),
    size = 0.3
  ) +
  stat_density2d(
    data = msaDataTrunc,
    aes(x = Longitude, y = Latitude, fill = ..level.., alpha = ..level..),
    size = 0.01,
    bins = 10,
    geom = "polygon"
  ) +
  scale_fill_gradient(low = "blue", high = "orange") +
  # guide = "none" hides the alpha legend; it replaces the deprecated
  # guide = FALSE.
  scale_alpha(range = c(0, 0.3), guide = "none") +
  labs(
    x = "Longitude",
    y = "Latitude",
    title = "Heat Map of US Mass Shooting Fatalities"
  ) +
  geom_point(
    data = msaDataTrunc,
    aes(x = Longitude, y = Latitude),
    alpha = .5,
    size = 1
  ) +
  # Red outlined box with a transparent interior.
  annotate(
    "rect",
    xmin = -97, ymin = 31, xmax = -99, ymax = 35,
    col = "red", fill = "transparent"
  ) +
  annotate("text", x = -95, y = 40, label = "this is text", size = 3) +
  # NOTE(review): start and end coordinates are identical here, so this is a
  # zero-length segment carrying an arrowhead -- confirm that is intended.
  annotate(
    "segment",
    x = -97.12, xend = -97.12, y = 31.55, yend = 31.55,
    arrow = arrow(length = unit(0.3, "cm")),
    size = 1
  ) +
  # Short horizontal segment at the same latitude.
  annotate(
    "segment",
    x = -97, xend = -97.2, y = 31.55, yend = 31.55,
    size = 1
  )
Let’s play with the garlic mustard data
# Read the garlic mustard measurements and preview the first rows.
garlicData <- read_csv("GarlicMustardData.csv")
head(garlicData)

# Fetch terrain tiles for the bounding box given in `us`.
us <- c(left = -126, bottom = 24.5, right = -66, top = 50)
myMap <- get_stamenmap(us, zoom = 5, maptype = "terrain")

# One point per row of garlicData, coloured by AvgAdultHeight, plus an orange
# arrow drawn from (-88, 35) to (-81, 39).
ggmap(myMap) +
  geom_point(
    data = garlicData,
    aes(x = Longitude, y = Latitude, color = AvgAdultHeight),
    alpha = .5,
    size = 1
  ) +
  annotate(
    "segment",
    x = -88, y = 35,
    xend = -81, yend = 39,
    color = "orange",
    arrow = arrow(length = unit(.3, "cm"))
  )
You already know how to search in Google, and you know how to filter a data frame. Example with iris: find all plants with species “setosa”.
# Keep only the iris rows whose Species is exactly "setosa".
filter(iris, Species == "setosa")
What if you want to do something more advanced?
Let’s practice with the babynames dataset.
# Load the babynames package and preview the first six rows of its dataset.
library(babynames)
head(babynames, n = 6L)
Example: what if we wanted to find all names with three vowels in a row?
There’s no specific string we could use, there’s lots of possibilities:
aaa
eee
aou
….
Regular expressions are a pattern matching language. Super cool! They give you a whole language to describe what patterns you’re looking for in data.
Useful on your final project especially: what if you’re looking for specific types of data?
If we want to search for vowels, use this regexp:
[aeiou]
This matches any single vowel. To find three in a row, repeat it three times (the shorthand [aeiou]{3} means exactly the same thing):
[aeiou][aeiou][aeiou]
Let’s try. First, wrangling:
head(babynames)

# Total count per (name, sex), summed over every row of babynames and sorted
# from most to least common.
# ungroup() drops the grouping by name that summarise() leaves behind, so the
# filters below run once over the whole table instead of once per name.
# NOTE: this object shares its name with base::names(); the name is kept
# because later code in this file refers to it.
names <- babynames %>%
  group_by(name, sex) %>%
  summarise(total = sum(n)) %>%
  ungroup() %>%
  arrange(desc(total))
head(names)

# Exact match on a single name.
names %>% filter(name == "Ali")

# Names containing three vowels in a row; ignore.case = TRUE so a leading
# capital vowel counts too.
names %>% filter(grepl("[aeiou]{3}", name, ignore.case = TRUE))
Q: what if we want to find names with three consonants in a row? I.e., three non-vowels?
# Names containing three non-vowels in a row: [^...] negates the character
# class and {3} asks for exactly three consecutive matches. "y" is listed in
# the class as well, so it is treated as a vowel here.
names %>% filter(grepl("[^aeiouy]{3}", name, ignore.case = TRUE))